YJNRC18

This is a report on TRD analyses of cross YJNRC18.

# get the data

library(tidytable)
library(ggplot2)
library(naturalsort)
library(scales)
library(stringr)

AF_across_genome<-fread(paste0("~/data/TRD/results/shiny/", cross, "-AF.csv.gz"))

TRD_regions<-fread(paste0("~/data/TRD/results/shiny/", cross, "-TRD_regions.csv.gz"))

1 The two parents

# load the Excel crosses file

crosses_xlsx=readxl::read_xlsx("~/data/trd/Crosses.xlsx", sheet=2)
## New names:
## * `Is in Stock` -> `Is in Stock...4`
## * `Is in Stock` -> `Is in Stock...6`
cc=data.frame("Cross ID"=c(paste0("ChrisC",1:8)),
                                  "Short name 1"=c("ACP","BAP","CCD","ATE","ACK","AKE","BAH","ANG"),
                                  "Short name 2"=c("BFP","CMP","CPG","SACE_YCR","ACV","BAH","CGD","CEI"), stringsAsFactors=FALSE)
colnames(cc)=str_replace_all(colnames(cc), fixed("."), " ")
crosses_all <- bind_rows(crosses_xlsx[,c("Cross ID","Short name 1","Short name 2")],
                        cc)
                        
crosses_all_slice<-filter(crosses_all, `Cross ID` == cross)
strain_1<-pull(crosses_all_slice, `Short name 1`)
strain_2<-pull(crosses_all_slice, `Short name 2`)

count_segregants<-NA

if(cross %in% pull(crosses_xlsx, `Cross ID`)){
    count_segregants<-filter(crosses_xlsx, `Cross ID` == cross)%>%
                    pull(`Colonies in pool`)
}

# load the Victor clusters

df_Strains<-fread("~/TRD/Shiny/data/Victor/operationalTable_Full2543Sace_Clades.csv")

cluster_1<-filter(df_Strains, StandardizedName == strain_1)%>%pull(Clade)
cluster_2<-filter(df_Strains, StandardizedName == strain_2)%>%pull(Clade)

# load the ASD

ASD_comparisons<-fread("~/data/TRD/comparisons.csv")
ASD_value<-filter(ASD_comparisons, (row==strain_1 & col == strain_2) |
                                    (row==strain_2 & col==strain_1))%>%
                                    pull(dist)
ASD_value<-unique(ASD_value)

ASD_translated_value<-ASD_value*(1554384/12000000)
ASD_translated_value<-round(ASD_translated_value*100,2)

Cross YJNRC18 was done between ACI (NA) and AKI (NA). I managed to pool 1256.

The parental strains had an allele sharing distance of 0.040594, which I think should translate into 0.53% distance between the genomes.

2 TRD across genome

# code below from TRD/02_TRD/01_getTRDs-GATK.R

chrs <- summarise(group_by(AF_across_genome, chr), maxpos = max(pos))
chrs <- chrs[naturalorder(chrs$chr), ]
chrs$global_pos <- cumsum(chrs$maxpos)

p<-ggplot(AF_across_genome, aes(global_pos, AD_A1 / sumCount)) +
  geom_point(alpha = 0.1, color = "grey") +
  geom_line(mapping = aes(global_pos, smoothed, color = abs(0.5 - smoothed)), inherit.aes = FALSE, linewidth = 2) +
  scale_color_viridis_c(option = "A", limits = c(0, 0.5)) +
  ylim(c(0, 1)) +
  geom_hline(yintercept = 0.5) +
  geom_vline(xintercept = chrs$global_pos) +
  geom_vline(xintercept=c(pull(TRD_regions,global_start),pull(TRD_regions,global_end)), color="red", linetype=2, alpha=0.5)+
  theme_bw(16) +
  ylab("Allele Frequency") +
  xlab("POS") +
  theme(legend.position = "none") +
  # geom_hline(yintercept = c(0.4,0.6))+
  ggtitle(cross) +
  labs(alpha = "Coverage") +
  scale_x_continuous(labels = comma)
  
TRD_regions<-mutate(TRD_regions,
                    length90=NA,
                    length80=NA,
                    length70=NA,
                    length60=NA,
                    maxStrength=NA)
                    
for(i in 1:nrow(TRD_regions)){
    TRD_regions_slice<-slice(TRD_regions, i)
    AF_across_genome_here<-filter(AF_across_genome, chr==pull(TRD_regions_slice, chr_start) & global_pos >= pull(TRD_regions_slice, global_start) & global_pos <= pull(TRD_regions_slice, global_end))
    
    TRD_regions<-mutate(TRD_regions,
                        length90=ifelse(ID==pull(TRD_regions_slice,ID),sum((pull(AF_across_genome_here,AD_A1)/pull(AF_across_genome_here,sumCount))>=0.9 | 
                        (pull(AF_across_genome_here,AD_A1)/pull(AF_across_genome_here,sumCount))<= 0.1),length90),
                        length80=ifelse(ID==pull(TRD_regions_slice,ID),sum((pull(AF_across_genome_here,AD_A1)/pull(AF_across_genome_here,sumCount))>=0.8 | 
                        (pull(AF_across_genome_here,AD_A1)/pull(AF_across_genome_here,sumCount))<= 0.2),length80),
                        length70=ifelse(ID==pull(TRD_regions_slice,ID),sum((pull(AF_across_genome_here,AD_A1)/pull(AF_across_genome_here,sumCount))>=0.7 | 
                        (pull(AF_across_genome_here,AD_A1)/pull(AF_across_genome_here,sumCount))<= 0.3),length70),
                        length60=ifelse(ID==pull(TRD_regions_slice,ID),sum((pull(AF_across_genome_here,AD_A1)/pull(AF_across_genome_here,sumCount))>=0.6 | 
                        (pull(AF_across_genome_here,AD_A1)/pull(AF_across_genome_here,sumCount))<= 0.4),length60),
              maxStrength=ifelse(ID==pull(TRD_regions_slice,ID),ifelse(max(pull(AF_across_genome_here,AD_A1)/pull(AF_across_genome_here,sumCount))>max(pull(AF_across_genome_here,AD_A2)/pull(AF_across_genome_here,sumCount)),
                        max((pull(AF_across_genome_here,AD_A1)/pull(AF_across_genome_here,sumCount))),max((pull(AF_across_genome_here,AD_A2)/pull(AF_across_genome_here,sumCount)))),maxStrength))
}
p

2.1 List of TRD regions

The order is the same as highlighted in the plot above. IDs don’t necessarily increase continuously.

library(reactable)

reactable(select(TRD_regions, ID, lengthSNPs, length90,
                    length80,
                    length70,
                    length60, lengthBp, 
                    maxStrength, chr_start, chr_end, global_start, global_end))

3 Structural variants

sv_plots_files<-paste0("~/data/trd/SV_analysis/",cross,".",c("sv_victor",
"syri_1","syri_2","mash1","mash2"),".RDS")

get_plots_if_they_exist<-function(x){
    if(file.exists(x)){
        return(readRDS(x))
    }else{
        return(NA)
    }
}

sv_plots<-lapply(sv_plots_files, get_plots_if_they_exist)
names(sv_plots)<-c("sv_victor",
"syri_1","syri_2","mash1","mash2")

3.1 Victor’s called SVs between parental strains

It is worth noting here that the sizes of SVs can be quite different between the parental strains and what has been merged on a population-level and is then shown here. Later, I will show quickly called SVs just between the two respective assemblies that one can then compare visually.

“TRD” regions added as well.

sv_plots[["sv_victor"]]+theme_bw(12)

3.2 Re-called SVs using syri

I called SVs then between the two respective assemblies. Here I will show two plots, one from the “perspective” of parent 1 and one from parent 2 (i.e. with their coordinates on their chromosomes). Note: The chromosomes were ordered naturally, but since some are concatenated (due to translocations likely), ordering could be less-than-perfect.

sv_plots[["syri_1"]]+theme_bw(12)+ylab("TYPE")+labs(color="TYPE")

sv_plots[["syri_2"]]+theme_bw(12)+ylab("TYPE")+labs(color="TYPE")

3.3 Dotplots

To help us understand some SVs better, I also produced dotplots, from both parents perspectives.

sv_plots[["mash1"]]+theme_bw(12)

sv_plots[["mash2"]]+theme_bw(12)

  • Summary per TRD of % overlaps
SV_data<-readRDS(paste0("/home/jnrunge/data/trd/SV_analysis/",cross,
                    ".SV_data.RDS"))
                    
                    
reactable(SV_data%>%filter(!is.na(TYPE))%>%
group_by(cross,ID,TYPE,source)%>%summarize(sum_LEN_rel=sum(LEN_rel),
sum_LEN=sum(LEN))%>%arrange(-sum_LEN))
## `summarise()` has grouped output by 'cross', 'ID', 'TYPE'. You can
## override using the `.groups` argument.

4 Local phylogeny

  • Summary
LP_data<-fread("/home/jnrunge/data/trd/local_phylogenies_trd_analysis/TRD_regions_with_LP_data.csv.gz")

cross_val<-cross

LP_data<-filter(LP_data,cross==cross_val)%>%select(ID,PCA_eucldist_quantile_1,PCA_eucldist_sd_multiplier_1,IBS_eucldist_quantile_1,IBS_eucldist_sd_multiplier_1,tree_changes_quantile,tree_changes_sd_multiplier,PCA_eucldist_quantile_2,PCA_eucldist_sd_multiplier_2,IBS_eucldist_quantile_2,IBS_eucldist_sd_multiplier_2)

LP_data<-pivot_longer(LP_data, -ID)

LP_data<-mutate(LP_data, value=round(value,2), abs=round(abs(value),2))

reactable(LP_data%>%arrange(-abs))
  • Plots

4.1 PCA / IBS-MDS

First, we can visualize the PCA and IBS-MDS of the TRD region across the 2.5K strains, highlighting the two crossed strains, blue for strain 1 and red for strain 2 (referring to up and down in the TRD plot).

for(id in pull(TRD_regions, ID)){
    if(!file.exists(paste0("~/data/trd/local_phylogenies_trd_analysis/",
                cross,".",id,".","PCA",".RDS"))){
                next
                }
    cat("### TRD region ",id, "  ")
    cat("\n") 
print(readRDS(paste0("~/data/trd/local_phylogenies_trd_analysis/",
                cross,".",id,".","PCA",".RDS")))
              cat("\n")   
print(readRDS(paste0("~/data/trd/local_phylogenies_trd_analysis/",
                cross,".",id,".","IBS MDS",".RDS")))
                cat("\n\n")   
}

4.1.1 TRD region 1

4.1.2 TRD region 3

4.1.3 TRD region 4

4.2 Distance plots

Next, we want to quantify how much the TRD regions are more or less distant to other strains in the PCA / IBS-MDS above compared to the rest of the strains’s genomes.

The red lines indicate the distance to the midpoint in the PCA or IBS-MDS of the TRD region, i.e. how different the strain’s region is compared to the average in the 2.5K dataset. To put it into perspective, 10,000 random 50 kB regions were drawn and the focal strain’s distance to the mean was taken for those as well. These random regions form the histogram in the background. The blue line shows a normal distribution of the same mean and sd as the random values. Hence, a strongly shifted red lined (shifted from median in the histogram and/or mean in the blue line) would indicate a particularly average (low value) or particularly distance (high value) TRD region.

for(id in pull(TRD_regions, ID)){
    if(!file.exists(paste0("~/data/trd/local_phylogenies_trd_analysis/",
                cross,".",id,".",strain_1,".","PCA",".RDS"))){
                next
                }
    cat("### TRD region ",id, " in strain 1, ", strain_1, "  ")
    cat("\n") 
print(readRDS(paste0("~/data/trd/local_phylogenies_trd_analysis/",
                cross,".",id,".",strain_1,".","PCA",".RDS")))
              cat("\n")   
print(readRDS(paste0("~/data/trd/local_phylogenies_trd_analysis/",
                cross,".",id,".",strain_1,".","IBS",".RDS")))
                cat("\n\n")   
                
                cat("### TRD region ",id, " in strain 2, ", strain_2, "  ")
    cat("\n") 
print(readRDS(paste0("~/data/trd/local_phylogenies_trd_analysis/",
                cross,".",id,".",strain_2,".","PCA",".RDS")))
              cat("\n")   
print(readRDS(paste0("~/data/trd/local_phylogenies_trd_analysis/",
                cross,".",id,".",strain_2,".","IBS",".RDS")))
                cat("\n\n")   
}

4.2.1 TRD region 1 in strain 1, ACI

4.2.2 TRD region 1 in strain 2, AKI

4.2.3 TRD region 3 in strain 1, ACI

4.2.4 TRD region 3 in strain 2, AKI

4.2.5 TRD region 4 in strain 1, ACI

4.2.6 TRD region 4 in strain 2, AKI

4.3 Changes in the local phylogeny

  • needs to be re-analyzed as basically all TRD regions were low outliers, implying that the 50 kB region size of the random regions determined more changes than the often larger TRD regions.

5 Population genomics

5.1 Nucleotide diversity

for(id in pull(TRD_regions, ID)){
    if(!file.exists(paste0("/home/jnrunge/data/trd/pop_genomics/",cross,".",id,"-relative-measures.RDS"))){
    next
    }
    rel_measures<-readRDS(paste0("/home/jnrunge/data/trd/pop_genomics/",cross,".",id,"-relative-measures.RDS"))
    cat("### TRD region ",id)
    cat("\n") 
    cat(paste0("The distorter-like strains have a pi of ", rel_measures$pi_distorter$pi_pop, " in the TRD region. That is the ", 
    rel_measures$pi_distorter$ecdf_value, " percentile, or a move of ", rel_measures$pi_distorter$standardized_measure, " SD from the mean, compared to the average of the distorters-like strains genome-wide."))
    cat("\n")
    cat(paste0("The other strains have a pi of ", rel_measures$pi_other$pi_pop, " in the TRD region. That is the ", 
    rel_measures$pi_other$ecdf_value, " percentile, or a move of ", rel_measures$pi_other$standardized_measure, " SD from the mean, compared to the average of the distorters-like strains genome-wide."))
    cat("\n")
    print(readRDS(paste0("/home/jnrunge/data/trd/pop_genomics/",cross,".",id,"-pi-plot.RDS")))
    cat("\n")
    cat("\n")
     
}

5.1.1 TRD region 1

The distorter-like strains have a pi of 0.0020369631844542 in the TRD region. That is the 0.49078843971012 percentile, or a move of -0.0821699322275691 SD from the mean, compared to the average of the distorters-like strains genome-wide. The other strains have a pi of 0.0046581784424804 in the TRD region. That is the 0.784597921941849 percentile, or a move of 0.0161326385397789 SD from the mean, compared to the average of the distorters-like strains genome-wide.

## Warning: Transformation introduced infinite values in continuous
## x-axis
## Warning: Removed 1208 rows containing non-finite values
## (`stat_bin()`).

5.1.2 TRD region 3

The distorter-like strains have a pi of 0.0002700876360526 in the TRD region. That is the 0.0399161498820858 percentile, or a move of -0.280169977066614 SD from the mean, compared to the average of the distorters-like strains genome-wide. The other strains have a pi of 0.0034005445397593 in the TRD region. That is the 0.550947350039291 percentile, or a move of -0.0615516573944238 SD from the mean, compared to the average of the distorters-like strains genome-wide.

## Warning: Transformation introduced infinite values in continuous
## x-axis
## Warning: Removed 1244 rows containing non-finite values
## (`stat_bin()`).

5.2 Fst

for(id in pull(TRD_regions, ID)){
if(!file.exists(paste0("/home/jnrunge/data/trd/pop_genomics/",cross,".",id,"-relative-measures.RDS"))){
    next
    }
    rel_measures<-readRDS(paste0("/home/jnrunge/data/trd/pop_genomics/",cross,".",id,"-relative-measures.RDS"))
    cat("### TRD region ",id)
    cat("\n") 
    cat(paste0("The population differentiation of distorter-like and other strains at the TRD locus is ", rel_measures$fst$sum_value, ". That is the ", 
    rel_measures$fst$ecdf_value, " percentile, or a move of ", rel_measures$fst$standardized_measure, " SD from the mean, compared to the average genome-wide."))
    cat("\n")
    print(readRDS(paste0("/home/jnrunge/data/trd/pop_genomics/",cross,".",id,"-Fst-plot.RDS")))
    cat("\n")
     cat("\n")
}

5.2.1 TRD region 1

The population differentiation of distorter-like and other strains at the TRD locus is 0.348156284255818. That is the 0.687860262008734 percentile, or a move of 0.464829170101667 SD from the mean, compared to the average genome-wide.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5.2.2 TRD region 3

The population differentiation of distorter-like and other strains at the TRD locus is 0.321642827005776. That is the 0.481222707423581 percentile, or a move of -0.0176319428171366 SD from the mean, compared to the average genome-wide.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5.3 Dxy

for(id in pull(TRD_regions, ID)){
if(!file.exists(paste0("/home/jnrunge/data/trd/pop_genomics/",cross,".",id,"-relative-measures.RDS"))){
    next
    }
    rel_measures<-readRDS(paste0("/home/jnrunge/data/trd/pop_genomics/",cross,".",id,"-relative-measures.RDS"))
    cat("### TRD region ",id)
    cat("\n") 
    cat(paste0("The nucleotide divergence of distorter-like and other strains at the TRD locus is ", rel_measures$dxy$sum_value, ". That is the ", 
    rel_measures$dxy$ecdf_value, " percentile, or a move of ", rel_measures$dxy$standardized_measure, " SD from the mean, compared to the average genome-wide."))
    cat("\n")
    print(readRDS(paste0("/home/jnrunge/data/trd/pop_genomics/",cross,".",id,"-Dxy-plot.RDS")))
    cat("\n")
     cat("\n")
}

5.3.1 TRD region 1

The nucleotide divergence of distorter-like and other strains at the TRD locus is 0.0058422917373723. That is the 0.75333973631363 percentile, or a move of 0.0199770410364193 SD from the mean, compared to the average genome-wide.

## Warning: Transformation introduced infinite values in continuous
## x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 573 rows containing non-finite values (`stat_bin()`).

5.3.2 TRD region 3

The nucleotide divergence of distorter-like and other strains at the TRD locus is 0.0039625111276071. That is the 0.447113285003057 percentile, or a move of -0.10097801752658 SD from the mean, compared to the average genome-wide.

## Warning: Transformation introduced infinite values in continuous
## x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 577 rows containing non-finite values (`stat_bin()`).

5.4 Tajima’s D

To understand whether the TRD region is showing signals of selection across all strains, I also calculated Tajima’s D. Note that this is not on a subset of e.g. distorter-like strains, but across all strains of the 2.5K matrix.

TD_values<-fread("/home/jnrunge/data/trd/TD_data/TD_plot_data.csv.gz")
cross_value<-cross
for(id in pull(TRD_regions, ID)){
    TD_values_slice<-filter(TD_values, cross==cross_value, ID==id)
    cat("### TRD region ",id)
    cat("\n") 
    cat(paste0("The TD value in the TRD region across all strains is ", pull(TD_values_slice, TD), ", which represents a percentile of ",
    pull(TD_values_slice, TD_ecdf), " and an SD shift of ", pull(TD_values_slice, TD_sd_multi)))
     cat("\n") 
    print(readRDS(paste0("/home/jnrunge/data/trd/TD_data","/",cross,".", id,".",
                     "TD_plot",".RDS")))
     cat("\n") 
      cat("\n") 

}

5.4.1 TRD region 1

The TD value in the TRD region across all strains is -2.10994697151443, which represents a percentile of 0.87040274322598 and an SD shift of 0.820980344445748

5.4.2 TRD region 3

The TD value in the TRD region across all strains is -2.26462985427465, which represents a percentile of 0.393573925906004 and an SD shift of -0.320560842616271

5.4.3 TRD region 4

The TD value in the TRD region across all strains is -2.31241604942311, which represents a percentile of 0.188260606468097 and an SD shift of -0.673217251402659

5.5 Linkage disequillibrium

Another important hint could be linkage disequillibrium, which I have also calculated across all strains. Note that this is not on a subset of e.g. distorter-like strains, but across all strains of the 2.5K matrix.

LD_values<-fread("/home/jnrunge/data/trd/LD_data/LD_plot_data.csv.gz")
cross_value<-cross
for(id in pull(TRD_regions, ID)){
    LD_values_slice<-filter(LD_values, cross==cross_value, ID==id)
    cat("### TRD region ",id)
    cat("\n") 
    cat(paste0("The LD value in the TRD region across all strains is ", pull(LD_values_slice, LD), " (R^2), which represents a percentile an SD shift of ", pull(LD_values_slice, LD_sd_multi_1), " compared to the lower-value group and a shift of ", pull(LD_values_slice, LD_sd_multi_2), " compared to the higher value group (LD is bimodally distributed)"))
     cat("\n") 
    print(readRDS(paste0("/home/jnrunge/data/trd/LD_data","/",cross,".", id,".",
                     "LD_plot",".RDS")))
     cat("\n") 
      cat("\n") 

}

5.5.1 TRD region 1

The LD value in the TRD region across all strains is 0.0647048808640902 (R^2), which represents a percentile an SD shift of 4.26012977226748 compared to the lower-value group and a shift of 0.164908474527954 compared to the higher value group (LD is bimodally distributed)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5.5.2 TRD region 3

The LD value in the TRD region across all strains is 0.0455407130325358 (R^2), which represents a percentile an SD shift of 3.20129897283394 compared to the lower-value group and a shift of -0.524920841658253 compared to the higher value group (LD is bimodally distributed)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5.5.3 TRD region 4

The LD value in the TRD region across all strains is 0.0391695220077175 (R^2), which represents a percentile an SD shift of 2.74696970520364 compared to the lower-value group and a shift of -0.820916810211925 compared to the higher value group (LD is bimodally distributed)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

6 Positioning of genomic signals among other strains

To investigate whether the TRD regions in this strain are particularly unique among all TRD regions analyzed across strains,

cross_value<-cross
pca_df<-readRDS("~/data/trd/genomic_signals_pca.RDS")
pca_df<-mutate(pca_df, ID_thiscross=ifelse(cross==cross_value, ID, NA))
print(ggplot(pca_df,
      aes(PC1,PC2,color=cross==cross_value))+
geom_point()+geom_label(mapping=aes(label=ID_thiscross)))
## Warning: Removed 60 rows containing missing values (`geom_label()`).

7 GO terms

for(id in pull(TRD_regions, ID)){
    cat("## TRD region ",id)
    cat("\n") 
    
    if(!file.exists(paste0("/home/jnrunge/data/trd/GO_data","/",cross, ".", id, ".reducedTerms.RDS"))){
    next
    }

    GO_data<-readRDS(paste0("/home/jnrunge/data/trd/GO_data","/",cross, ".", id, ".reducedTerms.RDS"))
    
    treemapPlot(GO_data)
     cat("\n") 
    wordcloudPlot(GO_data, min.freq=1, colors="black")

 cat("\n")  
 cat("\n") 

}

7.1 TRD region 1

7.2 TRD region 3

7.3 TRD region 4

## Warning in wordcloud::wordcloud(d$word, d$freq, ...): glycerophospholipid could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(d$word, d$freq, ...): lmethionine could not be
## fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(d$word, d$freq, ...): salvage could not be fit
## on page. It will not be plotted.
## Warning in wordcloud::wordcloud(d$word, d$freq, ...): metabolic could not be
## fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(d$word, d$freq, ...): regulation could not be
## fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(d$word, d$freq, ...): methylthioadenosine could
## not be fit on page. It will not be plotted.
## Warning in wordcloud::wordcloud(d$word, d$freq, ...): transcription could not
## be fit on page. It will not be plotted.